import pandas as pd
import numpy as np
# render every expression result in a notebook cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
# NOTE(review): suppresses every warning globally, including pandas/seaborn
# deprecation warnings — consider narrowing the filter
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
# folium renders interactive leaflet maps inside the notebook
import folium
from folium.plugins import FastMarkerCluster
from langdetect import detect
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import os
import string
from dython import nominal
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
# show up to 999 columns in the output rather than a collapsed, scaled-to-fit version
pd.set_option('display.max_columns', 999)
# Load the data
# NOTE(review): '.../' looks like a placeholder path — point it at the real
# location of the listings-summary CSV before running
berlin_data_raw = pd.read_csv('.../berlin-airbnb-data/berlin_listings_summary.csv')
# Inspect the data
berlin_data_raw.head(1)
Filter only those columns that are intended to be used
# keep only the columns needed for this analysis
columns_of_interest = [
    'id', 'host_identity_verified', 'neighbourhood_group_cleansed',
    'latitude', 'longitude', 'property_type', 'room_type', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'amenities', 'price', 'security_deposit',
    'cleaning_fee', 'guests_included', 'extra_people', 'number_of_reviews',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
    'review_scores_value', 'cancellation_policy',
]
berlin_data = berlin_data_raw[columns_of_interest]
# check the result
berlin_data.head()
# data dimensions before and after removing duplicate rows
print('The berlin Airbnb dataframe has {} rows and {} columns'.format(berlin_data.shape[0], berlin_data.shape[1]))
berlin_data = berlin_data.drop_duplicates()
print('The berlin Airbnb dataframe has {} rows and {} columns'.format(berlin_data.shape[0], berlin_data.shape[1]))
No duplicates found
# check the data types of each column for further usability
# (currency columns load as object/strings because of the '$' and ',' formatting)
berlin_data.dtypes
Columns like Price, security deposit, cleaning fee, extra people charges are stored as objects rather than integer/floats and would need to be converted.
# Check for missing values (count of NaN per column)
berlin_data.isnull().sum()
Imputations will be needed for multiple columns. Dropping them will be unproductive as although they are not significant in number compared to the entire data we would lose valuable information if dropped.
Lets begin by considering the columns based on price. These are 'security_deposit', 'cleaning_fee' and 'extra_people'
# inspect the price columns (rows 100-109, columns 12-16 by position)
berlin_data.iloc[100:110,12:17]
Price column has no empty values as observed previously. But values for Security Deposit and Cleaning Fee will need to be imputed. These values can be imputed to '0' since the owner may not charge these and has not specified them for his listing.
Next, the values will need to have the '$' removed and then ',' character removed before we convert the columns to a float value.
# Clean the currency columns: fill missing values with '$0.00', strip the
# '$' and ',' characters, and convert to float in one vectorised pass.
# (Replaces three near-identical rounds of `.str.split().apply(...)` — twelve
# near-duplicate lines — with a single loop over the affected columns.)
for money_col in ['price', 'security_deposit', 'cleaning_fee', 'extra_people']:
    berlin_data[money_col] = (
        berlin_data[money_col]
        .fillna('$0.00')
        .str.split().str[0]                    # keep the leading token, e.g. '$1,200.00'
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
# check the result
berlin_data.dtypes
# check for missing value imputations of the prices
berlin_data.isnull().sum()
The Price columns are rendered useable.
Now we will impute the values for bathrooms, bedrooms and beds.
# inspect the columns to be imputed (accommodates, bathrooms, bedrooms, beds)
berlin_data.iloc[1:10,7:11]
Using the groupby method to calculate the mode of values for each column (bathrooms, bedrooms and beds) against the value the listing 'accomodates' we can impute for these missing values.
# Impute bathrooms/bedrooms/beds with the per-'accommodates' group mode.
# pd.Series.mode returns the modes sorted ascending, so [0] picks the lower
# value when there are multiple modes. Filling via fillna on the grouped
# transform replaces the original create-temp-column / copy / drop sequence.
for col in ['bathrooms', 'bedrooms', 'beds']:
    group_mode = berlin_data.groupby(['accommodates'])[col].transform(lambda x: pd.Series.mode(x)[0])
    berlin_data[col] = berlin_data[col].fillna(group_mode)
# verifying for mode imputation (an example of accommodates = 3 & beds = NaN)
berlin_data.loc[berlin_data['id'] == 28365580] # imputed to 2 beds as per mode
# verify the imputations and plan next step
berlin_data.isna().sum()
Now we are left to deal with the Reviews columns. We have a considerable amount of missing values for these columns. We could drop them but as a consequence we would lose valuable information present on the other columns.
The appropriate action would be to replace the missing values with '0', as there aren't any ratings registered for these listings.
We will leave the remaining columns without any imputations as the imputations would not affect our analysis.
# a missing overall rating means no rating was registered — encode as 0
berlin_data['review_scores_rating'] = berlin_data['review_scores_rating'].fillna(0)
# verify the imputations
berlin_data.isna().sum()
The below transformation might provide some additional insight into the distribution of our data.
They are calculations based off of how a traveller or group of travellers might consider accommodation price when planning a trip
We can use the price based columns to determine the minimum budget per night by adding the " 'price' + 'cleaning_fee' + 'security_deposit' ".
Use the above to determine the minimum expenditure when adding the costs of additional guests staying. Calculated as " 'minimum_budget' + ('accommodates' - 'guests_included' * 'extra_people') "
Further use the above to determine the cost per head " ('minimum_budget_inc_guests'/'accommodates') ".
The following transformations will be needed for the correlation analysis within our data:
Convert 'cancellation_policy' which is currently a nominal column into a ordinal column
Use the 'amenities' column to determine the number of amenties the listing offers.
Finally,
# minimum spend for one night = listing price + cleaning fee + security deposit
berlin_data['minimum_budget'] = (
    berlin_data['price'] + berlin_data['cleaning_fee'] + berlin_data['security_deposit']
)
# guests beyond those included in the base price, each billed at the
# 'extra_people' rate
additional_guests = berlin_data['accommodates'] - berlin_data['guests_included']
berlin_data['guest_charge'] = additional_guests * berlin_data['extra_people']
berlin_data['minimum_budget_inc_guests'] = berlin_data['minimum_budget'] + berlin_data['guest_charge']
# drop the raw/intermediate columns now folded into the derived budgets
berlin_data = berlin_data.drop(columns=['security_deposit', 'guest_charge', 'guests_included', 'extra_people'])
berlin_data.isna().sum()
berlin_data.describe()
Minimum_budget_inc_guests has negative values which might have arisen from faulty entries present in the data which caused our calculation to be incorrect
If the data provided is faulty we can ignore those rows with negative values by replacing them with '0', provided they are a small number of negatives.
# check the count of rows having negatives
# (these arise when 'guests_included' exceeds 'accommodates' in the raw data,
# making the guest_charge term negative)
berlin_data.loc[berlin_data['minimum_budget_inc_guests'] < 0, 'minimum_budget_inc_guests' ].count()
# replacing faulty values with 0.00
berlin_data.loc[berlin_data['minimum_budget_inc_guests'] < 0,'minimum_budget_inc_guests'] = 0.00
berlin_data.describe()
# per-person cost assuming the listing is filled to capacity
berlin_data['cost_per_head'] = berlin_data['minimum_budget_inc_guests'] / berlin_data['accommodates']
berlin_data.isna().sum()
# list the distinct cancellation policies before encoding them
berlin_data.cancellation_policy.unique()
# map each cancellation policy (nominal) onto an ordinal strictness index,
# 1 = most flexible ... 5 = strictest
policy_rank = {
    'flexible': 1,
    'moderate': 2,
    'strict_14_with_grace_period': 3,
    'super_strict_30': 4,
    'super_strict_60': 5,
}
berlin_data['cancel_index'] = berlin_data['cancellation_policy'].replace(policy_rank)
# verify
berlin_data.cancel_index.unique()
# number of amenities = number of comma-separated entries in the raw string
berlin_data['number_of_amenities'] = berlin_data['amenities'].apply(lambda x: len(x.split(',')))
# verify
berlin_data.head(1)
# distribution of the overall review score (includes the zeros imputed above)
plt.figure(figsize=(15,5))
sns.set_style("whitegrid", {'axes.grid' : False})
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 — confirm the
# pinned seaborn version, or migrate to histplot/displot
sns.distplot(berlin_data.review_scores_rating,
kde_kws={'linewidth':1},
hist_kws={'alpha':.3})
plt.xlabel('\nRating score')
plt.ylabel('Count of rating\n')
plt.title('Distribution of review scores rating\n');
Since our data is significantly negatively skewed lets take the log of the data (without the zero values) and plot it again.
# log-transform the non-zero scores to try to tame the negative skew
plt.figure(figsize=(15,5))
sns.set_style("whitegrid", {'axes.grid' : False})
# taking log of the data except 0's (np.log1p computes log(1 + x))
sns.distplot(np.log1p(berlin_data.review_scores_rating[berlin_data['review_scores_rating'] > 0]),
kde_kws={'linewidth':1},
hist_kws={'alpha':.3})
plt.xlabel('\nRating score')
plt.ylabel('Count of rating\n')
plt.title('Log Transformed Distribution of review scores rating\n');
Log transforming does not help as there are present but very few reviews for the lower ranges.
Assigning a rating '1-5' to each listing based on its review score would have proved counterproductive: as the distributions above showed, most of the listings sit in the upper range, and hence we would not achieve effective segregation of the listings based on their rating score.
Lets reconsider our strategy and view the data above 75 rating to see its distribution and then make our decision to subset the popular listings.
# zoom in on the upper range: distribution of scores above 75
plt.figure(figsize=(15,5))
sns.set_style("whitegrid", {'axes.grid' : False})
sns.distplot(berlin_data.review_scores_rating[berlin_data['review_scores_rating'] > 75],
kde_kws={'linewidth':1},
hist_kws={'alpha':.3})
plt.xlabel('\nRating score')
plt.ylabel('Count of rating\n')
plt.title('Distribution of review scores rating\n');
Seems like most of the listings have scores that are concentrated between '80-100'.
Reconsidering our initial technique we'll select review_scores_ratings having score > 90 as high performing listings for our analysis.
With this we have ironed out our data and can start the analysis.
# listings per property type, most common first (value_counts sorts descending)
plt.figure(num=1, figsize=(15,6))
sns.set_style("whitegrid", {'axes.grid' : False})
plt.bar(x = berlin_data.property_type.value_counts().index,
height = berlin_data.property_type.value_counts().values)
plt.xlabel('Property Type')
plt.ylabel('Number Of Listings\n')
plt.title('Number of Listings per Property Type\n')
plt.xticks(rotation = 90);
Most of the Airbnb listings in Berlin during period our data is gathered are apartment listings with over 20000 of them with very small amounts of Condominiums, Lofts and Houses.
# listings per room type
plt.figure(num=2, figsize=(7,5))
plt.bar(x = berlin_data.room_type.value_counts().index,
height = berlin_data.room_type.value_counts().values, color=('green','blue','red'))
plt.xlabel('\nRoom Type',fontsize=13)
plt.ylabel('Number Of Listings\n',fontsize=13)
# NOTE(review): plt.rc mutates the global rcParams, so these tick sizes also
# apply to every figure drawn after this cell
plt.rc('xtick', labelsize=11)
plt.rc('ytick', labelsize=10)
plt.title('\nNumber of Listings per Room Type\n',fontsize=15);
#plt.xticks(rotation = 90);
Seems like shared rooms are unpopular with Airbnb property listers.
Lets view the composition of the property types shown before for the room types above.
# stacked composition of room types within each property type
plt.figure(num=3);
prop = berlin_data.groupby(['property_type','room_type']).room_type.count()
prop = prop.unstack()
# total listings per property type, used to rank rows and drop rare types
prop['total'] = prop.iloc[:,0:3].sum(axis = 1)
prop = prop.sort_values(by=['total'])
# keep only property types with at least 300 listings
prop = prop[prop['total']>=300]
prop = prop.drop(columns=['total'])
prop.plot(kind='barh',stacked=True, color = ["g","b","r"], linewidth = 1, grid=True, figsize=(15,8), width=1);
plt.title('Composition of Property types\n', fontsize=15);
plt.xlabel('\nNumber of listings', fontsize=13);
plt.ylabel('');
plt.legend(loc = 4,prop = {"size" : 13});
plt.rc('ytick', labelsize=13);
plt.show();
All shared rooms are within apartments. Probably some room owners looking to share their rooms for extra income.
The Loft property type is mainly being let out completely as the entire home, with a small amount listed having private rooms.
Other Property types are almost equally let out as an entire property or as a Private Room.
# horizontal box plot of nightly price, x-axis clipped to [-10, 300] for readability
plt.figure(num = 4);
red_square = dict(markerfacecolor='r', markeredgecolor='r', marker='.', markersize = 1)
berlin_data['price'].plot(kind='box', vert=False, xlim=(-10,300),flierprops=red_square, figsize=(15,3))
plt.rc('xtick', labelsize=11)
plt.show();
The prices have a very broad range as indicated by the outliers above. However we can ascertain that majority of the prices are lesser than $150.
# price distribution per room type (same x-axis clip as the overall box plot)
plt.figure(num=5, figsize=(19, 3))
ax = sns.boxplot(x = 'price', y = 'room_type', data = berlin_data, palette = ['g','b','r'], fliersize = 1.5, linewidth=1)
plt.xlim(-10, 300)
plt.title('Price Range per Room Type\n', fontsize=15)
plt.xlabel('\nPrice', fontsize=14)
plt.ylabel('')
plt.rc('ytick', labelsize=14)
plt.show();
The room types majorly have prices below $150.
The shared room is expected to have a lower price range than the other room types, which is evident above. However the plotted outliers indicate a few pricey listings as well. These must be rooms that accommodate a larger number of people.
# Subset each room type, keeping only listings priced within the bulk of the
# distribution (<= $150).
# Bug fix: the original assigned the three room-type subsets and then
# immediately overwrote all of them with the same price-only filter, leaving
# three identical DataFrames; the two conditions are now combined.
berlin_data_sharedRoom = berlin_data.loc[(berlin_data['room_type'] == 'Shared room') & (berlin_data['price'] <= 150), :]
berlin_data_privateRoom = berlin_data.loc[(berlin_data['room_type'] == 'Private room') & (berlin_data['price'] <= 150), :]
berlin_data_entireHome = berlin_data.loc[(berlin_data['room_type'] == 'Entire home/apt') & (berlin_data['price'] <= 150), :]
# overview of listings over a Berlin map
latitudes = berlin_data['latitude'].tolist()
longitudes = berlin_data['longitude'].tolist()
locations = list(zip(latitudes, longitudes))
map_cluster = folium.Map(location=[52.509, 13.381], zoom_start=11)
# Bug fix: the original assigned the instance to the name FastMarkerCluster,
# shadowing the imported class and breaking any later use of it
FastMarkerCluster(data=locations).add_to(map_cluster)
map_cluster
Majority of listings located in the South East Region with Central Berlin having the bulk of listings.
# listings per neighbourhood group
plt.figure(num=6, figsize=(15,6))
plt.bar(x = berlin_data.neighbourhood_group_cleansed.value_counts().index,
height = berlin_data.neighbourhood_group_cleansed.value_counts().values)
plt.xlabel('\nNeighbourhood Region',fontsize=13)
plt.ylabel('Number Of Listings\n',fontsize=13)
plt.rc('xtick', labelsize=14)
plt.rc('ytick', labelsize=11)
plt.title('Number of Listings per Neighbourhood\n',fontsize=15);
plt.xticks(rotation = 90);
# median price per neighbourhood (the median is robust to the price outliers)
neighbourhood_price = pd.DataFrame(berlin_data.groupby(['neighbourhood_group_cleansed'])['price'].agg(np.median))
neighbourhood_price.reset_index(inplace=True)
# plot this
plt.figure(num=7)
neighbourhood_price.sort_values(by=['price'], ascending=True)\
.plot(kind='barh', x='neighbourhood_group_cleansed', y='price',
figsize=(11,7), legend = False)
plt.xlabel('\nMedian Price', fontsize=13)
plt.ylabel('Neighbourhood\n', fontsize=13)
plt.title('Median Prices per neighbourhood\n', fontsize=15);
Let's examine if we can discover any relations by plotting prices and number of reviews given by customers.
For this we'll use a scatterplot with near transparent data points. This will allow us to view the concentration of data points by causing more dense regions to stand out.
Each data point is a listing and we can infer the following:
We'll limit the observable price range to $300 as most prices fall within this range as observed before and it should include a few outlier cases as well.
# 2x2 grid: number of reviews against price and each derived budget column,
# drawn with near-transparent points so dense regions stand out
plt.figure(num=8, figsize=(15,15), linewidth=1)
# consider only those data points that have reviews
berlin_data2 = berlin_data.loc[berlin_data['number_of_reviews'] > 0]
plt.subplot2grid((6, 6), (0, 0),rowspan=3, colspan=3)
ax = sns.scatterplot(x='number_of_reviews', y='price', data=berlin_data2, alpha = 0.1)
plt.title('Number of Reviews Vs Price per night\n', fontsize=15,fontweight = 'bold')
plt.xlabel('\nNumber of Reviews', fontsize=14)
plt.ylabel('Price per night\n',fontsize=14)
plt.rc('ytick', labelsize=14)
plt.rc('xtick', labelsize=13)
plt.ylim(0, 300);
plt.subplot2grid((6, 6), (0, 3),rowspan=3, colspan=3)
ax = sns.scatterplot(x='number_of_reviews', y='minimum_budget', data=berlin_data2,alpha = 0.1)
plt.title('Number of Reviews Vs Minimum Budget per night\n', fontsize=15, fontweight = 'bold')
plt.xlabel('\nNumber of Reviews', fontsize=14)
plt.ylabel('Minimum Budget per night\n',fontsize=14)
plt.rc('ytick', labelsize=13)
plt.rc('xtick', labelsize=13)
plt.ylim(0, 300);
plt.subplot2grid((6, 6), (3, 0),rowspan=3, colspan=3)
ax = sns.scatterplot(x='number_of_reviews', y='minimum_budget_inc_guests', data=berlin_data2,alpha = 0.1)
plt.title('\nNumber of Reviews Vs Minimum Budget with Extra Guests\n', fontsize=15,fontweight = 'bold')
plt.xlabel('\nNumber of Reviews', fontsize=14)
plt.ylabel('Minimum Budget with Extra Guests\n',fontsize=14)
plt.rc('ytick', labelsize=14)
plt.rc('xtick', labelsize=13)
plt.ylim(0, 300);
plt.subplot2grid((6, 6), (3, 3), rowspan=3, colspan=3)
ax = sns.scatterplot(x='number_of_reviews', y='cost_per_head', data=berlin_data2,alpha = 0.1)
plt.title('\nNumber of Reviews Vs Cost per head\n', fontsize=15,fontweight = 'bold')
plt.xlabel('\nNumber of Reviews', fontsize=14)
plt.ylabel('Cost per head\n',fontsize=14)
plt.rc('ytick', labelsize=14)
plt.rc('xtick', labelsize=13)
plt.ylim(0, 300);
plt.tight_layout();
Firstly it seems that the derived columns don't have any significant difference amongst each other, even when compared with the original price column. The only difference between the plots of price and the derived columns is that, in the region where Number of reviews < 10 and Price > $150, the derived-column plots have a greater density of listings.
This is because the initial price column is essentially being scaled in our derived columns (through the addition of price values like security deposit, etc). This translates to the 1st plot spreading into the higher price ranges but as it still has the same number of reviews value corresponding to it, this is also spread to the higher price ranges.
So lets consider our initial price column for further analysis as the others dont provide much value. But not discard them as they could be used in further analysis.
What stands out is
To analyse the Airbnb listings in Berlin which have a higher performance we will consider two metrics over which we subset our data
which will allow us to reflect upon those listings which attract more visitors and perform well respectively.
From our previous analysis we have identified the upper ranges of these values to be >= 200 and and >=90 respectively.
Lets first see the amount of lisitngs we get using the above as a filter and then judge for further analysis.
# filter for high performing listings
# thresholds chosen from the earlier distribution plots: many reviews (>= 200)
# AND a high overall rating (>= 90)
berlin_data_pop = berlin_data.loc[(berlin_data['number_of_reviews'] >= 200) & (berlin_data['review_scores_rating'] >= 90)]
# has 161 listings under our condition
print('\nThe high performing dataframe has {} listings '.format(berlin_data_pop.shape[0]))
plt.figure(num=9, figsize=(7,5))
plt.bar(x = berlin_data_pop.room_type.value_counts().index,
height = berlin_data_pop.room_type.value_counts().values, color=('blue','green','red'))
plt.xlabel('\nRoom Type',fontsize=13)
plt.ylabel('Number Of Listings\n',fontsize=13)
plt.rc('xtick', labelsize=11)
plt.rc('ytick', labelsize=11)
plt.title('Number of Listings per Room Type\n',fontsize=15);
Seems our previous observation still holds true across the high performing listings, although almost at parity. Shared rooms not among the popular listings with just one listing.
# price per room type, restricted to the high performing subset
plt.figure(num=10, figsize=(19, 4))
sns.set_style("whitegrid", {'axes.grid' : True})
ax = sns.boxplot(x = 'price', y = 'room_type', data = berlin_data_pop, palette = ['b','g','r'], fliersize = 1.5, linewidth=1)
plt.xlim(-10, 300)
plt.title('Price Range per Room Type\n', fontsize=15)
plt.xlabel('\nPrice', fontsize=14)
plt.ylabel('')
plt.rc('ytick', labelsize=14)
plt.show();
We have a single popular listing for shared room having price ~$80.
The 'Entire Home/Apt.' listings are considerably more expensive. Each room type displays a range of prices for the high performing listings with their distributions slightly skewed to the right indicating the tendency toward lower prices.
The outliers indicate a few listings with prices above $150 per night for the entire home category. These could be premium listings. Although this gives us an overview, we cant conclude anything significant by analysing the prices. If we analyse the prices on a map we could form more relations and derive insights.
# base map creation over which we will plot the roomtypes
# (52.509, 13.381) is central Berlin
lat = 52.509
long = 13.381
base2 = folium.Map(location=[lat,long], zoom_start=11.5)
base2;
# defining colours for the room type markers
def colour(room_type):
    """Return the marker colour for a given room type.

    Bug fix: the parameter was previously named `berlin_popular_type` while
    the body read `room_type`, which only resolved via the loop's global
    variable — the argument was silently ignored.
    """
    if room_type == 'Private room':
        return 'blue'
    elif room_type == 'Entire home/apt':
        return 'green'
    else:
        return 'red'
# determining the plot points locations and adding to the base map
berlin_popular_type = folium.map.FeatureGroup()
# NOTE(review): the original zipped (longitude, latitude) into names (lat, long)
# and then passed [long, lat] to folium.Circle — a double swap that ended up
# plotting the correct [latitude, longitude] pair. Variables renamed so each
# one holds what its name says; the plotted coordinates are unchanged.
for lat, lng, room_type in zip(berlin_data_pop.latitude, berlin_data_pop.longitude, berlin_data_pop.room_type):
    berlin_popular_type.add_child(
        folium.Circle(
            [lat, lng],
            radius=100,
            color=colour(room_type),
            fill=True,
            fill_color=colour(room_type),
            opacity=0.4,
            fill_opacity=0.4))
base2.add_child(berlin_popular_type)
There is no discernible difference in the location placement of the different room types.
As indicated by the cluster density it looks like North central Berlin has the majority of our listings, in contrast to the south eastern region as observed previously.
# base map for price bins at different locations (same central-Berlin centre)
lat = 52.509
long = 13.381
base = folium.Map(location=[lat,long], zoom_start=11.5)
base;
# function to assign a colour to price for its range
def colour(price):
    """Bucket a nightly price into one of three red shades for the map."""
    if price < 50:
        return '#FF9595'  # light red
    elif price < 100:
        return '#FF3939'  # red
    else:
        return '#A91919'  # dark red
# add the details to the map
berlin_popular = folium.map.FeatureGroup()
# NOTE(review): as in the room-type map, the original zipped
# (longitude, latitude) into names (lat, long) and passed [long, lat] to
# folium.Circle — a double swap that still plotted [latitude, longitude].
# Renamed so each variable holds what it says; plotted coordinates unchanged.
for lat, lng, price in zip(berlin_data_pop.latitude, berlin_data_pop.longitude, berlin_data_pop.price):
    berlin_popular.add_child(
        folium.Circle(
            [lat, lng],
            radius=100,
            color=colour(price),
            fill=True,
            fill_color=colour(price),
            opacity=0.6,
            fill_opacity=0.6))
base.add_child(berlin_popular)
# matplotlib colour bar acting as the legend for the folium price map above
fig, ax = plt.subplots(figsize=(17, 2))
fig.subplots_adjust(bottom=0.5)
# creating the colour bar for price ranges (same hex shades as colour())
cmap = mpl.colors.ListedColormap(['#FF9595', '#FF3939', '#A91919'])
bounds = [0,50, 100,150]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
# NOTE(review): mpl.colorbar.ColorbarBase is deprecated in newer matplotlib
# releases in favour of Colorbar/fig.colorbar — confirm the pinned version
cb2 = mpl.colorbar.ColorbarBase(ax, cmap=cmap,
norm=norm,
#boundaries=[0] + bounds + [13],
extend='max',
ticks=bounds,
spacing='proportional',
orientation='horizontal')
cb2.set_label('\nPrice Ranges')
fig.show()
Using a diverging colour range for the prices we can see that our price ranges aren't confined to certain regions.
We can observe that the exceptionally higher-priced listings tend to be present closer to the center of Berlin, with a few cases on the fringes of the city. It is more likely these are premium listings, as their locations seem to not play a distinguishing role.
Its difficult to discern any insight from this as the different types of rooms do not have any specific clusters and are almost equally present.
# cancellation-policy counts among the high performing listings
plt.figure(num=11, figsize=(7,6))
plt.bar(x = berlin_data_pop.cancellation_policy.value_counts().index,
height = berlin_data_pop.cancellation_policy.value_counts().values)
plt.xlabel('\nCancellation Policy',fontsize=13)
plt.ylabel('Count\n',fontsize=13)
plt.rc('xtick', labelsize=11)
plt.rc('ytick', labelsize=11)
plt.title('Count of Cancellation policy\n',fontsize=15);
Having a flexible cancellation policy is also not an underlying theme for these listings.
# variable to hold column names that we need to analyse
measures = ['review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication',
'review_scores_location','review_scores_value']
# convert wide data to long format against the above measures
measures_data_tidy = pd.melt(berlin_data_pop, value_vars = measures, var_name='measures', value_name = 'measures_value')
plt.figure(num=12, figsize=(15,8), linewidth=1)
# `labels` is unused here; the tick labels are set explicitly via plt.xticks below
labels = ['Accuracy', 'Cleanliness','Check-in','Communication','Location','Value']
ax = sns.violinplot(x='measures', y='measures_value', data=measures_data_tidy)
plt.title('Distribution of review ratings for popular listings\n', fontweight = 'bold', fontsize=15)
plt.xlabel('\nRating type', fontsize=13)
plt.xticks([0,1,2,3,4,5,6],['Accuracy', 'Cleanliness','Check-in','Communication','Location','Value'])
plt.ylabel('Score\n',fontsize=13)
plt.ylim(7.3, 11);
Looks like it is location and value ratings that guests are more critical of, with ratings for location being the lowest amongst all ratings, going as low as 8. Additionally, unlike the other categories having a majority score of 10, value scores are divided between 10 and 9.
Lets finally use a correlation matrix to highlight any correlation amongst our variables that we were unable to identify through the previous analysis.
# Correlation Matrix
variables = ['accommodates','bathrooms','bedrooms','beds','price','number_of_reviews','review_scores_rating',
'review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication',
'review_scores_location','review_scores_value','cost_per_head','cancel_index','number_of_amenities']
plt.figure(num=13, figsize=(20,20))
corrMat = berlin_data_pop[variables].corr(method='pearson')
plt.matshow(corrMat, fignum=False, vmin=-1, vmax=1, cmap='inferno')
# Annotate every cell; strong positive correlations (> 0.7) in dark green,
# everything else in white. (The original's `< 0` branch produced the same
# white text as the else branch, so the three-way split was redundant.)
for i in range(len(variables)):
    for j in range(len(variables)):
        value = corrMat.iloc[i, j]
        cell_colour = "darkgreen" if value > 0.7 else "white"
        plt.text(i, j, round(value, 2), ha="center", va="center", color=cell_colour)
plt.xticks(range(len(variables)), variables, rotation=90, fontsize = 15)
plt.yticks(range(len(variables)), variables, fontsize = 15)
plt.tick_params(axis='x', labelbottom=True, labeltop=False, top=False, bottom=False)
ax = plt.gca()
# minor ticks at cell edges draw thin white separator lines between cells
ax.set_xticks(np.arange(len(variables))-.5, minor=True)
ax.set_yticks(np.arange(len(variables))-.5, minor=True)
plt.tick_params(axis='x', which='both', labelbottom=True, labeltop=False, top=False, bottom=False)
plt.grid(which='minor', color='w', linestyle='-', linewidth=1)
plt.grid(False)
plt.title('Correlation Matrix - Pearson', fontsize = 20, fontweight = 'bold')
plt.colorbar();
No significant positive or negative correlation amongst variables apart from 'accommodates' and 'beds'.
From the above carried analysis we've gained an overview, but haven't obtained any solid patterns leading us to draw definite conclusions from the various levels in the data of our high performing listings.
The next approach would be to look at the user reviews to extract and hopefully get a better idea of any underlying themes, if any, for our popular listings.
For this lets first use a wordcloud to determine the most frequent words and then judge the next action.
# load the reviews data
# NOTE(review): '/.../' looks like a placeholder path — point it at the real file
reviews = pd.read_csv('/.../berlin_reviews_summary.csv')
# ids of the listings determined as high performing previously
# (the original initialised `pop_id = []` and immediately rebound it — removed)
pop_id = berlin_data_pop.id
# use the id list to filter reviews
reviews_pop = reviews[reviews.listing_id.isin(pop_id)]
# 43053 reviews to extract from
reviews_pop
# create a function to detect the language used for the review. we need only English reviews
def language(comment):
    """Return the ISO-639-1 language code detected for `comment`.

    Falls back to 'Unknown Language' when detection fails — langdetect raises
    for text it cannot analyse (e.g. empty or emoji-only reviews).
    """
    review = str(comment)
    try:
        return detect(review)
    # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate
    except Exception:
        return 'Unknown Language'
# determine the language of every review in the comments column
# Bug fix: DataFrame.set_value was deprecated and removed in pandas 1.0;
# a vectorised apply replaces the per-row iterrows() loop entirely
reviews_pop = reviews_pop.copy()
reviews_pop['language'] = reviews_pop['comments'].apply(language)
# consider only english reviews
reviews_en = reviews_pop.loc[reviews_pop['language']=='en']
# verify the language column (30482 reviews to use)
reviews_en.language.unique()
reviews_en
# Build one corpus string from all English reviews.
# str.join is linear; the original `words += row['comments']` loop over
# iterrows() risks quadratic behaviour on a large corpus.
words = ''.join(reviews_en['comments'])
# split above words string into substrings for further processing
reviews_data = nltk.word_tokenize(words)
# create variable to filter common punctuations
punctuation = string.punctuation
# create variable to filter out the below characters and numbers
char = ['\r','\n','',' ','!',',']
numbers = ['0','1','2','3','4','5','6','7','8','9']
# create variable to filter out common english stop words which have a high frequency and may affect our word cloud
StopWords = stopwords.words('english')
# Strip unwanted characters from every token.
# Bug fix: `word not in (punctuation or numbers or char)` evaluated the
# parenthesised expression to just `punctuation` (the first truthy operand),
# so digits and the extra characters were never filtered.
excluded_chars = set(punctuation) | set(numbers) | set(char)
words_filtered = [''.join(c for c in token if c not in excluded_chars) for token in reviews_data]
words_filtered = [token for token in words_filtered if token]
comments_filtered = ' '.join(words_filtered)
# Bug fix: compare the lowercased word against the (lowercase) stop list;
# the original lowercased only after the membership test, letting
# capitalised stop words like 'The' through.
comments_filtered = ' '.join(word.lower() for word in comments_filtered.split() if word.lower() not in StopWords)
wordcloud = WordCloud(width = 1000, height = 700).generate(comments_filtered)
plt.figure(num=14, figsize=(18,12));
plt.imshow(wordcloud);
plt.axis('off');
plt.show();
The word cloud allows us to see the most frequent used words in the reviews. For the high performing listings the following factors seem dominant
Now that we have some idea of the reviews, lets try to extract topics from the reviews using topic modelling through Non-Negative Matrix Factorization with TF-IDF. To run the model we need to first clean our data and get it in an efficient format before passing it to the necessary functions for further use.
# cleaning the reviews data
# create a column for the clean values
reviews_en['comments_filtered'] = reviews_en['comments']
# Remove punctuation tokens.
# Bug fix: `word not in (punctuation or numbers or char)` evaluated to just
# `punctuation`; build the full exclusion set instead. The lambda parameter
# is also renamed from `string`, which shadowed the imported module.
token_exclusions = set(punctuation) | set(numbers) | set(char)
reviews_en['comments_filtered'] = reviews_en['comments_filtered'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in token_exclusions))
# Remove the stopwords defined earlier and convert the words to lowercase.
# Bug fix: test the lowercased word against the stop list (the original
# lowercased only after the membership check).
reviews_en['comments_filtered'] = reviews_en['comments_filtered'].apply(
    lambda text: ' '.join(word.lower() for word in text.split() if word.lower() not in StopWords))
# remove all other substrings apart from words
reviews_en['comments_filtered'] = reviews_en['comments_filtered'].apply(
    lambda text: ' '.join(word for word in text.split() if word.isalpha()))
# lemmatize to extract base words to account for the same meaning of different words used in the reviews
Lemmatizer = WordNetLemmatizer()
reviews_en['comments_filtered'] = reviews_en['comments_filtered'].apply(
    lambda text: ' '.join(Lemmatizer.lemmatize(word) for word in text.split()))
# max number of Words in Vocabulary
max_words = 2000
# create TF-IDF Vectorizer (its builtin english stop list applies on top of
# the NLTK stopword removal already performed above)
tfidfVect = TfidfVectorizer(max_features=max_words, stop_words='english');
# tokenize, build vocab encode corpus
tf = tfidfVect.fit_transform(reviews_en['comments_filtered'])
# get features
# NOTE(review): get_feature_names was removed in scikit-learn 1.2 —
# use get_feature_names_out() there; confirm the pinned sklearn version
tfidf_feature_names = tfidfVect.get_feature_names()
# assign the hyperparameter
numberTopics = 6
# run the model
# NOTE(review): NMF's single `alpha` parameter was split into alpha_W/alpha_H
# and removed in scikit-learn 1.2 — confirm the pinned sklearn version
nmf = NMF(n_components=numberTopics, beta_loss='kullback-leibler', solver='mu', max_iter=100,
alpha=.1, l1_ratio=.5, init='nndsvd').fit(tf)
# function to display the top words
def print_top_words(model, feature_names, n_top_words):
    """Print the `n_top_words` highest-weighted terms of every topic.

    `model` is any fitted decomposition exposing a `components_` matrix of
    shape (n_topics, n_features); `feature_names` maps feature index -> term.
    A blank line is printed after each topic for readability.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; slicing from the end yields the top weights
        top_terms = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d: " % topic_idx + " ".join(top_terms))
        print()
# display the topics with words (top 25 terms per topic)
numberTopWords = 25
print_top_words(nmf, tfidf_feature_names, numberTopWords)
No clear distinction between the topics, except that a few correspond to the important themes we previously identified, such as
Maybe using a different technique might provide favourable results.